from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
import numpy as np
import os
import pandas as pd
import math
import matplotlib.pyplot as plt
from scipy.cluster import hierarchy
import matplotlib as mpl
from matplotlib.lines import Line2D
from matplotlib_venn import venn2
from mpl_toolkits.axes_grid1.inset_locator import InsetPosition
from mpl_toolkits.axes_grid1.inset_locator import inset_axes
from matplotlib.patches import Circle, Wedge, Polygon
import csv
from matplotlib.patches import Patch
from matplotlib import pyplot
import pickle
from scipy.spatial import distance
from scipy import stats
from sklearn import manifold
from sklearn.decomposition import PCA
from scipy.cluster import hierarchy
import scipy.spatial.distance as ssd
folder = '/Users/robynwright/Documents/OneDrive/Github/PET-Plastisphere/3_isolates/e_global_distribution/TARA_16S/'
folder2 = '/Users/robynwright/Documents/OneDrive/Github/PET-Plastisphere/3_isolates/e_global_distribution/'
folder3 = '/Users/robynwright/Documents/OneDrive/Github/PET-Plastisphere/3_isolates/e_global_distribution/plastisphere_16S/'
bacillus_16S_sanger = 'Bacillus_16S_sanger.fna'
bacillus_16S_genome = 'Bacillus_16S_genome.fasta'
thioclava_16S_sanger = 'Thioclava_16S_sanger.fna'
thioclava_16S_genome = 'Thioclava_16S_genome.fasta'
mitag_folder = 'TARA_16SrRNA.miTAGs/'
blast_db_folder = 'blast_database/'
blast_out = 'blast_out/'
From Figshare:
wget https://ndownloader.figshare.com/articles/4902920/versions/1
mv 1 4902920.zip
unzip 4902920.zip
rm 4902920.zip
parallel -j 2 --link 'python /home/robyn/tools/quast-5.0.2/metaquast.py {} -r isolates/Bacillus_annotated.ffn --threads 12 -o quast_results/Bacillus_annotated_{/.}' \
::: TARA_assemblies/*
parallel -j 2 --link 'python /home/robyn/tools/quast-5.0.2/metaquast.py {} -r isolates/Bacillus_assembled.fasta --threads 12 -o quast_results/Bacillus_assembled_{/.}' \
::: TARA_assemblies/*
parallel -j 2 --link 'python /home/robyn/tools/quast-5.0.2/metaquast.py {} -r isolates/Thioclava_annotated.ffn --threads 12 -o quast_results/Thioclava_annotated_{/.}' \
::: TARA_assemblies/*
parallel -j 2 --link 'python /home/robyn/tools/quast-5.0.2/metaquast.py {} -r isolates/Thioclava_assembled.fasta --threads 12 -o quast_results/Thioclava_assembled_{/.}' \
::: TARA_assemblies/*
#tar -czvf name-of-archive.tar.gz /path/to/directory-or-file
mg_folder = folder2+'metagenome/all/'
folders = os.listdir(mg_folder)
coverage, sample = [], []
count = 0
for fol in folders:
if '.DS_Store' in fol: continue
count += 1
results = pd.read_csv(mg_folder+fol+'/summary/TSV/Genome_fraction.tsv', sep='\t', header=0, index_col=0)
column = results.columns[0]
row = results.index.values[0]
percent = results.loc[row, column]
sample.append(fol)
coverage.append(percent)
all_coverage = pd.DataFrame(coverage, index=sample, columns=['Coverage'])
all_coverage.to_csv(folder2+'metagenome/coverage.csv')
cmd = 'makeblastdb -in '+folder+'MiSeq_succession_sequences.fasta -dbtype nucl -out '+folder2+blast_db_folder+'miseq_succession'
os.system(cmd)
isolates = [bacillus_16S_sanger, bacillus_16S_genome, thioclava_16S_sanger, thioclava_16S_genome]
names = ['bacillus_sanger_', 'bacillus_genome_', 'thioclava_sanger_', 'thioclava_genome_']
folder_out = [blast_out+'bacillus/', blast_out+'bacillus/', blast_out+'thioclava/', blast_out+'thioclava/']
for a in range(len(isolates)):
db_name = folder+blast_db_folder+'miseq_succession'
query = folder+isolates[a]
out_fn = folder2+folder_out[a]+names[a]+'miseq_succession'+'.txt'
cmd = 'blastn -db '+db_name+' -query '+query+' -out '+out_fn+' -perc_identity 90 -outfmt 6 -max_target_seqs 20000'
os.system(cmd)
bacillus_matches = pd.read_csv(folder2+blast_out+'bacillus/'+'bacillus_genome_miseq_succession.txt', sep='\t', names=['qseqid', 'sseqid', 'pident', 'length', 'mismatch', 'gapopen', 'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore'])
thioclava_matches = pd.read_csv(folder2+blast_out+'thioclava/'+'thioclava_genome_miseq_succession.txt', sep='\t', names=['qseqid', 'sseqid', 'pident', 'length', 'mismatch', 'gapopen', 'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore'])
results = pd.DataFrame(thioclava_matches)
matches_90 = results[results.loc[:, 'pident'] > 90]
matches_95 = results[results.loc[:, 'pident'] > 95]
matches_97 = results[results.loc[:, 'pident'] > 97]
matches_99 = results[results.loc[:, 'pident'] > 99]
matches_90 = list(matches_90.loc[:, 'sseqid'].values)
matches_95 = list(matches_95.loc[:, 'sseqid'].values)
matches_97 = list(matches_97.loc[:, 'sseqid'].values)
matches_99 = list(matches_99.loc[:, 'sseqid'].values)
all_abundance = pd.read_csv(folder+'miseq_over_time_all.csv', header=0, index_col=0)
abundance_95 = list(all_abundance.loc[matches_95, :].sum(axis=0))
abundance_97 = list(all_abundance.loc[matches_97, :].sum(axis=0))
abundance_99 = list(all_abundance.loc[matches_99, :].sum(axis=0))
x = [1, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 16, 17, 19, 20, 21, 22, 23, 24, 25, 27, 28, 29, 30, 31, 32, 33, 35, 36, 37, 38, 39, 40, 41, 43, 44, 45, 46, 47, 48, 49]
days = [0, 1, 3, 7, 14, 21, 30, 42, 1, 3, 7, 14, 21, 30, 42, 1, 3, 7, 14, 21, 30, 42, 1, 3, 7, 14, 21, 30, 42, 1, 3, 7, 14, 21, 30, 42, 1, 3, 7, 14, 21, 30, 42]
plt.figure(figsize=(15,10))
ax1 = plt.subplot2grid((10,1), (0,0), frameon=False)
colormap = mpl.cm.get_cmap('plasma', 256)
norm_01 = mpl.colors.Normalize(vmin=0, vmax=0.3)
norm_05 = mpl.colors.Normalize(vmin=-0.3, vmax=0.9)
norm_5 = mpl.colors.Normalize(vmin=-2, vmax=3)
m_01 = mpl.cm.ScalarMappable(norm=norm_01, cmap=colormap)
m_05 = mpl.cm.ScalarMappable(norm=norm_05, cmap=colormap)
m_5 = mpl.cm.ScalarMappable(norm=norm_5, cmap=colormap)
abun = [abundance_95, abundance_97, abundance_99]
for a in range(len(x)):
for b in range(3):
if abun[b][a] < 0.1: color = m_01.to_rgba(abun[b][a])
elif abun[b][a] < 0.5: color = m_05.to_rgba(abun[b][a])
else: color = m_5.to_rgba(abun[b][a])
plt.bar(x[a], 1, bottom=b, width=1, edgecolor='k', color=color)
plt.xlim([0.5, 49.6]), plt.ylim([-0.1,3])
plt.xticks(x, days), plt.yticks([0.5, 1.5, 2.5], ['>95% identity', '>97% identity', '>99% identity'])
labels = ['Inoculum', 'No carbon', 'BHET', 'Amorphous\nPET biofilm', 'Amorphous\nPET planktonic', 'PET powder', 'Weathered PET powder']
locations = [1, 6, 14, 22, 30, 38, 46]
text_colors = ['k', 'y', 'orange', 'm', 'r', 'b', 'g']
for a in range(len(labels)):
plt.text(locations[a], 3.2, labels[a], ha='center', color=text_colors[a], fontweight='bold')
plt.text(26, -0.2, 'Days', ha='center')
plt.ylim([0.95,3])
plt.show()
# plt.savefig(folder2+'Thioclava_miseq_succession.png', bbox_inches='tight', dpi=600)
samples = all_abundance.columns
print('Maximum >95% identity Thioclava', max(abun[0]), 'in sample', samples[abun[0].index(max(abun[0]))])
## Maximum >95% identity Thioclava 4.132164142000001 in sample Day21BHET
print('Maximum >97% identity Thioclava', max(abun[1]), 'in sample', samples[abun[1].index(max(abun[1]))])
## Maximum >97% identity Thioclava 0.4181815610000001 in sample Day00Inoc
print('Maximum >99% identity Thioclava', max(abun[2]), 'in sample', samples[abun[2].index(max(abun[2]))])
## Maximum >99% identity Thioclava 0.26767257899999997 in sample Day00Inoc
bacillus_matches = pd.read_csv(folder2+blast_out+'bacillus/'+'bacillus_genome_miseq_succession.txt', sep='\t', names=['qseqid', 'sseqid', 'pident', 'length', 'mismatch', 'gapopen', 'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore'])
results = pd.DataFrame(bacillus_matches)
matches_90 = results[results.loc[:, 'pident'] > 90]
matches_95 = results[results.loc[:, 'pident'] > 95]
matches_97 = results[results.loc[:, 'pident'] > 97]
matches_99 = results[results.loc[:, 'pident'] > 99]
matches_90 = list(matches_90.loc[:, 'sseqid'].values)
matches_95 = list(matches_95.loc[:, 'sseqid'].values)
matches_97 = list(matches_97.loc[:, 'sseqid'].values)
matches_99 = list(matches_99.loc[:, 'sseqid'].values)
all_abundance = pd.read_csv(folder+'miseq_over_time_all.csv', header=0, index_col=0)
abundance_95 = list(all_abundance.loc[matches_95, :].sum(axis=0))
abundance_97 = list(all_abundance.loc[matches_97, :].sum(axis=0))
abundance_99 = list(all_abundance.loc[matches_99, :].sum(axis=0))
x = [1, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 16, 17, 19, 20, 21, 22, 23, 24, 25, 27, 28, 29, 30, 31, 32, 33, 35, 36, 37, 38, 39, 40, 41, 43, 44, 45, 46, 47, 48, 49]
days = [0, 1, 3, 7, 14, 21, 30, 42, 1, 3, 7, 14, 21, 30, 42, 1, 3, 7, 14, 21, 30, 42, 1, 3, 7, 14, 21, 30, 42, 1, 3, 7, 14, 21, 30, 42, 1, 3, 7, 14, 21, 30, 42]
plt.figure(figsize=(15,10))
ax1 = plt.subplot2grid((10,1), (0,0), frameon=False)
colormap = mpl.cm.get_cmap('plasma', 256)
norm_01 = mpl.colors.Normalize(vmin=0, vmax=0.3)
norm_05 = mpl.colors.Normalize(vmin=-0.3, vmax=0.9)
norm_5 = mpl.colors.Normalize(vmin=-2, vmax=3)
m_01 = mpl.cm.ScalarMappable(norm=norm_01, cmap=colormap)
m_05 = mpl.cm.ScalarMappable(norm=norm_05, cmap=colormap)
m_5 = mpl.cm.ScalarMappable(norm=norm_5, cmap=colormap)
abun = [abundance_95, abundance_97, abundance_99]
for a in range(len(x)):
for b in range(3):
if abun[b][a] < 0.1: color = m_01.to_rgba(abun[b][a])
elif abun[b][a] < 0.5: color = m_05.to_rgba(abun[b][a])
else: color = m_5.to_rgba(abun[b][a])
plt.bar(x[a], 1, bottom=b, width=1, edgecolor='k', color=color)
plt.xlim([0.5, 49.6]), plt.ylim([-0.1,3])
plt.xticks(x, days), plt.yticks([0.5, 1.5, 2.5], ['>95% identity', '>97% identity', '>99% identity'])
labels = ['Inoculum', 'No carbon', 'BHET', 'Amorphous\nPET biofilm', 'Amorphous\nPET planktonic', 'PET powder', 'Weathered PET powder']
locations = [1, 6, 14, 22, 30, 38, 46]
text_colors = ['k', 'y', 'orange', 'm', 'r', 'b', 'g']
for a in range(len(labels)):
plt.text(locations[a], 3.2, labels[a], ha='center', color=text_colors[a], fontweight='bold')
plt.text(26, -0.2, 'Days', ha='center')
plt.ylim([0.95,3])
plt.show()
# plt.savefig(folder2+'Bacillus_miseq_succession.png', bbox_inches='tight', dpi=600)
samples = all_abundance.columns
print('Maximum >95% identity Bacillus', max(abun[0]), 'in sample', samples[abun[0].index(max(abun[0]))])
## Maximum >95% identity Bacillus 17.313476744000003 in sample Day01NoC
print('Maximum >97% identity Bacillus', max(abun[1]), 'in sample', samples[abun[1].index(max(abun[1]))])
## Maximum >97% identity Bacillus 0.079681931 in sample Day07LowCrys
print('Maximum >99% identity Bacillus', max(abun[2]), 'in sample', samples[abun[2].index(max(abun[2]))])
## Maximum >99% identity Bacillus 0.079681931 in sample Day07LowCrys
Note that the data is presented separately here for searches with the 16S sequences from the whole genome sequences of both isolates as well as the 16S sequences from routine sanger sequencing.
bacillus_genome = pd.read_csv(folder+'bacillus_genome_16S_blast.csv', index_col=0, header=0)
bacillus_sanger = pd.read_csv(folder+'bacillus_sanger_16S_blast.csv', index_col=0, header=0)
thioclava_genome = pd.read_csv(folder+'thioclava_genome_16S_blast.csv', index_col=0, header=0)
thioclava_sanger = pd.read_csv(folder+'thioclava_sanger_16S_blast.csv', index_col=0, header=0)
sequences = pd.read_csv(folder+'sequences_per_sample.csv', index_col=0, header=0)
sample_locations = pd.read_csv(folder+'TARA_sample_locations_renamed.csv', index_col=0, header=0)
coverage = pd.read_csv(folder2+'metagenome/coverage.csv', index_col=0, header=0)
station_locations = {'004':'ANE', '150':'ANE', '151':'ANE', '152':'ANE', '141':'ANW', '142':'ANW', '145':'ANW', '146':'ANW', '148':'ANW', '149':'ANW', '066':'ASE', '067':'ASE', '068':'ASE', '070':'ASE', '072':'ASW', '076':'ASW', '078':'ASW', '082':'ASW', '036':'ION', '038':'ION', '039':'ION', '041':'ION', '042':'ION', '045':'ION', '048':'ION', '052':'IOS', '056':'IOS', '057':'IOS', '058':'IOS', '062':'IOS', '064':'IOS', '065':'IOS', '018':'MED', '023':'MED', '025':'MED', '030':'MED', '132':'PON', '133':'PON', '137':'PON', '138':'PON', '140':'PON', '100':'PSE', '102':'PSE', '109':'PSE', '110':'PSE', '111':'PSE', '093':'PSE', '094':'PSE', '096':'PSE', '098':'PSE', '099':'PSE', '112':'PSW', '122':'PSE', '123':'PSW', '124':'PSW', '125':'PSW', '128':'PSW', '031':'RED', '032':'RED', '033':'RED', '034':'RED', '084':'SOC', '085':'SOC'}
colormap = mpl.cm.get_cmap('PiYG', 256)
norm_01 = mpl.colors.Normalize(vmin=0, vmax=0.3)
norm_05 = mpl.colors.Normalize(vmin=-0.3, vmax=0.9)
norm_5 = mpl.colors.Normalize(vmin=-2, vmax=3)
m_01 = mpl.cm.ScalarMappable(norm=norm_01, cmap=colormap)
m_05 = mpl.cm.ScalarMappable(norm=norm_05, cmap=colormap)
m_5 = mpl.cm.ScalarMappable(norm=norm_5, cmap=colormap)
zero = '#6002BF'
def get_plot(ra, bc, bc1, sng, surfd, surfd1):
for row in ra.index.values:
seqs = sequences.loc[row, :].values[0]
ra.loc[row, :] = ra.loc[row, :].divide(seqs)*100
plt.figure(figsize=(16,16))
ax1 = plt.subplot2grid((3,1), (0,0), rowspan=2)
img = plt.imread(folder2+'world_map.jpg')
ax1.imshow(img, extent=[-180, 180, -90, 90], alpha=0.3)
samples = list(ra.index.values)
for a in range(len(samples)):
if surfd not in samples[a]: continue
loc = sample_locations.loc[samples[a], ['Longitude', 'Latitude']].values
if not isinstance(loc[0], float):
loc = loc[0]
mg_station = samples[a].split('_')[1]
if mg_station in station_locations:
mg_station = station_locations[mg_station]
sn = bc1+'_assembled_TARA_'+mg_station+'_RAW'
mg_coverage = coverage.loc[sn, 'Coverage']
else:
mg_coverage = 0
sample = ra.loc[samples[a], :].values
heatmap = [sample[1], sample[0], mg_coverage, sample[2]]
s = 4
x, y = [loc[0]-s, loc[0], loc[0]-s, loc[0]], [loc[1]-s, loc[1]-s, loc[1], loc[1]]
for b in range(len(heatmap)):
if heatmap[b] < 0.1: color = m_01.to_rgba(heatmap[b])
elif heatmap[b] < 0.5: color = m_05.to_rgba(heatmap[b])
else: color = m_5.to_rgba(heatmap[b])
ax1.bar(x[b], height=s, bottom=y[b], color=color, edgecolor='k', width=s)
s=6
loc = [130, -55]
x, y = [loc[0]-s, loc[0], loc[0]-s, loc[0]], [loc[1]-s, loc[1]-s, loc[1], loc[1]]
for b in range(4):
ax1.bar(x[b], height=s, bottom=y[b], color='w', edgecolor='k', width=s)
ax1.text(x[0]-(s/2)-2, y[0]+(s/2), '>95% identity', ha='right', va='center')
ax1.text(x[1]+(s/2)+2, y[1]+(s/2), '>90% identity', ha='left', va='center')
ax1.text(x[1]+(s/2)+2, y[2]+(s/2), '>97% identity', ha='left', va='center')
ax1.text(x[0]-(s/2)-2, y[3]+(s/2), 'Metagenome coverage', ha='right', va='center')
plt.xticks([]), plt.yticks([])
plt.title(bc+' relative abundance in '+surfd1+' waters')
axins1 = inset_axes(ax1, width="20%", height="5%", loc='lower right', borderpad=4)
cb1 = mpl.colorbar.ColorbarBase(axins1, cmap=colormap, norm=norm_01, orientation='horizontal')
plt.sca(axins1)
plt.xticks([0, 0.1, 0.2, 0.3], [0, 0.1, 0.5, 3])
plt.xlabel('%')
relabun = pd.DataFrame(bacillus_genome)
bac, bac1, sg, sd, sd1 = '$Bacillus$ sp. BHET2', 'Bacillus', 'genome', 'SRF', 'surface'
get_plot(relabun, bac, bac1, sg, sd, sd1)
## <string>:52: UserWarning: Use the colorbar set_ticks() method instead.
plt.show()
# plt.savefig(folder+bac1+'_'+sg+'_'+sd1+'.png', bbox_inches='tight', dpi=600)
relabun = pd.DataFrame(bacillus_genome)
bac, bac1, sg, sd, sd1 = '$Bacillus$ sp. BHET2', 'Bacillus', 'genome', 'DCM', 'deep'
get_plot(relabun, bac, bac1, sg, sd, sd1)
relabun.to_csv(folder+'bacillus_relabun.csv')
plt.show()
# plt.savefig(folder+bac1+'_'+sg+'_'+sd1+'.png', bbox_inches='tight', dpi=600)
relabun = pd.DataFrame(bacillus_sanger)
bac, bac1, sg, sd, sd1 = '$Bacillus$ sp. BHET2', 'Bacillus', 'sanger', 'SRF', 'surface'
get_plot(relabun, bac, bac1, sg, sd, sd1)
plt.show()
# plt.savefig(folder+bac1+'_'+sg+'_'+sd1+'.png', bbox_inches='tight', dpi=600)
relabun = pd.DataFrame(bacillus_sanger)
bac, bac1, sg, sd, sd1 = '$Bacillus$ sp. BHET2', 'Bacillus', 'sanger', 'DCM', 'deep'
get_plot(relabun, bac, bac1, sg, sd, sd1)
plt.show()
# plt.savefig(folder+bac1+'_'+sg+'_'+sd1+'.png', bbox_inches='tight', dpi=600)
relabun = pd.DataFrame(thioclava_genome)
bac, bac1, sg, sd, sd1 = '$Thioclava$ sp. BHET1', 'Thioclava', 'genome', 'SRF', 'surface'
get_plot(relabun, bac, bac1, sg, sd, sd1)
relabun.to_csv(folder+'thioclava_relabun.csv')
plt.show()
# plt.savefig(folder+bac1+'_'+sg+'_'+sd1+'.png', bbox_inches='tight', dpi=600)
relabun = pd.DataFrame(thioclava_genome)
bac, bac1, sg, sd, sd1 = '$Thioclava$ sp. BHET1', 'Thioclava', 'genome', 'DCM', 'deep'
get_plot(relabun, bac, bac1, sg, sd, sd1)
plt.show()
# plt.savefig(folder+bac1+'_'+sg+'_'+sd1+'.png', bbox_inches='tight', dpi=600)
relabun = pd.DataFrame(thioclava_sanger)
bac, bac1, sg, sd, sd1 = '$Thioclava$ sp. BHET1', 'Thioclava', 'sanger', 'SRF', 'surface'
get_plot(relabun, bac, bac1, sg, sd, sd1)
plt.show()
# plt.savefig(folder+bac1+'_'+sg+'_'+sd1+'.png', bbox_inches='tight', dpi=600)
relabun = pd.DataFrame(thioclava_sanger)
bac, bac1, sg, sd, sd1 = '$Thioclava$ sp. BHET1', 'Thioclava', 'sanger', 'DCM', 'deep'
get_plot(relabun, bac, bac1, sg, sd, sd1)
plt.show()
# plt.savefig(folder+bac1+'_'+sg+'_'+sd1+'.png', bbox_inches='tight', dpi=600)
cmd = 'makeblastdb -in '+folder3+'dna-sequences.fasta -dbtype nucl -out '+folder2+blast_db_folder+'plastisphere_ma'
os.system(cmd)
isolates = [bacillus_16S_sanger, bacillus_16S_genome, thioclava_16S_sanger, thioclava_16S_genome]
names = ['bacillus_sanger_', 'bacillus_genome_', 'thioclava_sanger_', 'thioclava_genome_']
folder_out = [blast_out+'bacillus/', blast_out+'bacillus/', blast_out+'thioclava/', blast_out+'thioclava/']
for a in range(len(isolates)):
db_name = folder2+blast_db_folder+'plastisphere_ma'
query = folder+isolates[a]
out_fn = folder2+folder_out[a]+names[a]+'plastisphere_ma'+'.txt'
cmd = 'blastn -db '+db_name+' -query '+query+' -out '+out_fn+' -perc_identity 90 -outfmt 6 -max_target_seqs 20000'
os.system(cmd)
Note that these files are zipped in the github folders and will need to be unzipped prior to use.
ft = pd.read_csv(folder3+'feature-table_w_tax.txt', header=1, index_col=0, sep='\t')
ft = ft.drop(['taxonomy'], axis=1)
ft = ft.divide(ft.sum(axis=0), axis=1).multiply(100)
ft.to_csv(folder3+'feature_table_rel_abun.csv')
To only ASVs identified in the BLAST searches and only marine samples:
ft = pd.read_csv(folder3+'feature_table_rel_abun.csv', header=0, index_col=0)
bacillus_matches = pd.read_csv(folder2+blast_out+'bacillus/'+'bacillus_genome_plastisphere_ma.txt', sep='\t', names=['qseqid', 'sseqid', 'pident', 'length', 'mismatch', 'gapopen', 'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore'])
thioclava_matches = pd.read_csv(folder2+blast_out+'thioclava/'+'thioclava_genome_plastisphere_ma.txt', sep='\t', names=['qseqid', 'sseqid', 'pident', 'length', 'mismatch', 'gapopen', 'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore'])
all_asv = list(bacillus_matches.loc[:, 'sseqid'].values)+list(thioclava_matches.loc[:, 'sseqid'].values)
ft = ft.loc[all_asv, :]
md = pd.read_csv(folder3+'metadata.txt', header=0, index_col=0, sep='\t')
keeping = []
for sample in ft.columns:
if md.loc[sample, 'Environment'] == 'marine':
keeping.append(True)
else:
keeping.append(False)
ft_marine = ft.loc[:, keeping]
unique = []
rename_ft_plastic = {}
other = ['rubber', 'pla', 'san', 'phbv', 'epoxy', 'pa', 'polyester', 'organic', 'blank', 'sediment', 'positive']
for sample in md.index.values:
if md.loc[sample, 'PlasticTypeSpecific'] in other:
rename_ft_plastic[sample] = 'other'
if 'other' not in unique:
unique.append('other')
else:
rename_ft_plastic[sample] = md.loc[sample, 'PlasticTypeSpecific']
if md.loc[sample, 'PlasticTypeSpecific'] not in unique:
unique.append(md.loc[sample, 'PlasticTypeSpecific'])
def get_plot(matches, ax, limit=97):
ft_plastic = ft_marine.rename(columns=rename_ft_plastic)
above = matches[matches.loc[:, 'pident'] > limit]
above_asv = list(above.loc[:, 'sseqid'].values)
ft_plastic_bac = ft_plastic.loc[above_asv, :]
plastic_bac_sum = pd.DataFrame(ft_plastic_bac.sum(axis=0)).transpose().rename(index={0:'Sum'})
groups = list(set(plastic_bac_sum.columns))
groups = ['other', 'water', 'not_plastic', 'plastic', 'pvc', 'ps', 'pp', 'pe', 'pet']
groups.reverse()
group_plots = []
for group in groups:
this_group = plastic_bac_sum.loc['Sum', group]
if not isinstance(this_group, float):
group_plots.append(plastic_bac_sum.loc['Sum', group].values)
else:
group_plots.append([this_group])
new_groups = ['Other', 'Water', 'Control biofilm', 'Unidentified plastic', 'PVC', 'PS', 'PP', 'PE', 'PET']
new_groups.reverse()
plt.sca(ax)
plt.boxplot(group_plots, labels=new_groups)
plt.xticks(rotation=90)
if ax == ax1:
plt.ylabel('Relative abundance (%)')
return
Plot in sample types:
plt.figure(figsize=(10,5))
ax1, ax2, ax3, ax4 = plt.subplot(141), plt.subplot(142), plt.subplot(143), plt.subplot(144)
axes = [ax1, ax2, ax3, ax4]
match = [thioclava_matches, thioclava_matches, bacillus_matches, bacillus_matches, bacillus_matches]
limits = [97, 99, 97, 99]
titles = ['$Thioclava$ sp. BHET1\n>97% identity', '$Thioclava$ sp. BHET1\n>99% identity', '$Bacillus$ sp. BHET2\n>97% identity', '$Bacillus$ sp. BHET2\n>99% identity']
for a in range(len(axes)):
get_plot(match[a], axes[a], limit=limits[a])
plt.sca(axes[a])
# if a != 0:
# plt.yticks([])
plt.title(titles[a])
plt.tight_layout()
plt.show()
# plt.savefig(folder3+'Both abundance in plastics.png', dpi=600, bbox_inches='tight')
def get_plot_scatter(matches, ax, limit=97):
ft_plastic = ft_marine.rename(columns=rename_ft_plastic)
above = matches[matches.loc[:, 'pident'] > limit]
above_asv = list(above.loc[:, 'sseqid'].values)
ft_plastic_bac = ft_plastic.loc[above_asv, :]
plastic_bac_sum = pd.DataFrame(ft_plastic_bac.sum(axis=0)).transpose().rename(index={0:'Sum'})
groups = list(set(plastic_bac_sum.columns))
groups = ['other', 'water', 'not_plastic', 'plastic', 'pvc', 'ps', 'pp', 'pe', 'pet']
groups.reverse()
group_plots = []
for group in groups:
this_group = plastic_bac_sum.loc['Sum', group]
if not isinstance(this_group, float):
group_plots.append(plastic_bac_sum.loc['Sum', group].values)
else:
group_plots.append([this_group])
new_groups = ['Other', 'Water', 'Control biofilm', 'Unidentified plastic', 'PVC', 'PS', 'PP', 'PE', 'PET']
new_groups.reverse()
plt.sca(ax)
x = []
for g in range(len(groups)):
x.append(g)
pltx = np.random.normal(g, scale=0.1, size=len(group_plots[g]))
# pltx = [g for x in range()]
plt.scatter(pltx, group_plots[g], alpha=0.2, s=15)
lq, uq = np.quantile(group_plots[g], 0.25), np.quantile(group_plots[g], 0.75)
quantiles = [[lq], [uq]]
plt.errorbar(g, np.mean(group_plots[g]), color='k', marker='o', capsize=2)
# plt.boxplot(group_plots, labels=new_groups)
plt.xticks(x, new_groups, rotation=90)
if ax == ax1:
plt.ylabel('Relative abundance (%)')
return lq, uq
plt.figure(figsize=(3,10))
ax1, ax2 = plt.subplot(211), plt.subplot(212)
axes = [ax1, ax2]
match = [thioclava_matches, thioclava_matches]
limits = [99, 97]
titles = ['>99% identity', '>97% identity']
for a in range(len(axes)):
lq, uq = get_plot_scatter(match[a], axes[a], limit=limits[a])
print(lq, uq)
plt.sca(axes[a])
if a == 0:
plt.xticks([])
plt.ylabel('Relative abundance (%)')
axes[a].text(.5,.95,titles[a], ha='center', transform=axes[a].transAxes, fontsize=12)
plt.tight_layout()
plt.show()
# plt.savefig(folder3+'Thioclava abundance in plastics.png', dpi=600, bbox_inches='tight')
plt.figure(figsize=(3,10))
ax1, ax2 = plt.subplot(211), plt.subplot(212)
axes = [ax1, ax2]
match = [bacillus_matches, bacillus_matches]
limits = [99, 97]
titles = ['>99% identity', '>97% identity']
for a in range(len(axes)):
get_plot_scatter(match[a], axes[a], limit=limits[a])
plt.sca(axes[a])
if a == 0:
plt.xticks([])
plt.ylabel('Relative abundance (%)')
axes[a].text(.5,.95,titles[a], ha='center', transform=axes[a].transAxes, fontsize=12)
plt.tight_layout()
plt.show()
# plt.savefig(folder3+'Bacillus abundance in plastics.png', dpi=600, bbox_inches='tight')
Note that here samples are grouped to latitude/longitude multiples of 5. This can be changed by changing the r_base value. Map plotting function:
def plot_map(matches, ax1=None, cmap='plasma', legend=True, r_base=5):
colormap = mpl.cm.get_cmap(cmap, 256)
norm_01 = mpl.colors.Normalize(vmin=0, vmax=0.3)
norm_05 = mpl.colors.Normalize(vmin=-0.3, vmax=0.9)
norm_5 = mpl.colors.Normalize(vmin=-2, vmax=3)
m_01 = mpl.cm.ScalarMappable(norm=norm_01, cmap=colormap)
m_05 = mpl.cm.ScalarMappable(norm=norm_05, cmap=colormap)
m_5 = mpl.cm.ScalarMappable(norm=norm_5, cmap=colormap)
if ax1 == None:
plt.figure(figsize=(16,16))
ax1 = plt.subplot2grid((3,1), (0,0), rowspan=2)
img = plt.imread(folder2+'world_map.jpg')
plt.sca(ax1)
ax1.imshow(img, extent=[-180, 180, -90, 90], alpha=0.3)
above_97 = matches[matches.loc[:, 'pident'] > 97]
above_asv_97 = list(above_97.loc[:, 'sseqid'].values)
above_99 = matches[matches.loc[:, 'pident'] > 99]
above_asv_99 = list(above_99.loc[:, 'sseqid'].values)
# ft_plastic = ft_marine.rename(columns=rename_ft_plastic)
# plastic_bac_sum = pd.DataFrame(ft_plastic_bac.sum(axis=0)).transpose().rename(index={0:'Sum'})
ft_bac_97 = ft_marine.loc[above_asv_97, :]
ft_bac_99 = ft_marine.loc[above_asv_99, :]
ft_bac_97 = pd.DataFrame(ft_bac_97.sum(axis=0)).transpose().rename(index={0:'Sum'})
ft_bac_99 = pd.DataFrame(ft_bac_99.sum(axis=0)).transpose().rename(index={0:'Sum'})
samples = ft_bac_97.columns
location = {}
for sample in samples:
#longitude = x axis, latitude = y axis
loc = md.loc[sample, ['Longitude', 'Latitude']]
for l in range(len(loc)):
loc[l] = r_base * round(loc[l]/r_base)
location[sample] = str(loc[0])+', '+str(loc[1])
ft_loc_97 = ft_bac_97.rename(columns=location)
ft_loc_99 = ft_bac_99.rename(columns=location)
ft_loc_97 = ft_loc_97.groupby(by=ft_loc_97.columns, axis=1).max()
ft_loc_99 = ft_loc_99.groupby(by=ft_loc_99.columns, axis=1).max()
locations = ft_loc_97.columns
for loc in locations:
loc_string = str(loc)
loc = loc.split(',')
loc[0], loc[1] = int(loc[0]), int(loc[1])
r = 2.5
nums = [ft_loc_97.loc['Sum', loc_string], ft_loc_99.loc['Sum', loc_string]]
colors = []
print(nums)
for n in nums:
if n < 0.1: n = m_01.to_rgba(n)
elif n < 0.5: n = m_05.to_rgba(n)
else: n = m_5.to_rgba(n)
colors.append(n)
wedge1 = Wedge((loc[0], loc[1]), r, 90, -90, facecolor=colors[0], edgecolor='k')
wedge2 = Wedge((loc[0], loc[1]), r, -90, 90, facecolor=colors[1], edgecolor='k')
ax1.add_artist(wedge1)
ax1.add_artist(wedge2)
if legend:
s=4
loc = [140, 55]
wedge1 = Wedge((loc[0], loc[1]), s, 90, -90, facecolor='w', edgecolor='k')
wedge2 = Wedge((loc[0], loc[1]), s, -90, 90, facecolor='w', edgecolor='k')
ax1.add_artist(wedge1)
ax1.add_artist(wedge2)
ax1.text(loc[0]+5, loc[1], '>99% identity', ha='left', va='center')
ax1.text(loc[0]-5, loc[1], '>97% identity', ha='right', va='center')
axins1 = inset_axes(ax1, width="20%", height="5%", loc='upper right', borderpad=1)
cb1 = mpl.colorbar.ColorbarBase(axins1, cmap=colormap, norm=norm_01, orientation='horizontal')
plt.sca(axins1)
plt.xticks([0, 0.1, 0.2, 0.3], [0, 0.1, 0.5, 3])
plt.xlabel('Relative abundance (%)')
plt.sca(ax1)
plt.xticks([]), plt.yticks([])
return
Distribution Thioclava:
plot_map(thioclava_matches)
plt.show()
# plt.savefig(folder3+'Thioclava distribution.png', dpi=600, bbox_inches='tight')
Note that here samples are grouped to latitude/longitude multiples of 5. This can be changed by changing the r_base value in the above function. Distribution Bacillus:
plot_map(bacillus_matches)
plt.show()
# plt.savefig(folder3+'Bacillus distribution.png', dpi=600, bbox_inches='tight')